# Load the compiled experiment results. read_csv() resolves relative paths
# against the working directory already, so wrapping the path in
# file.path(getwd(), ...) was redundant.
results <- read_csv("compiled_results.csv")
# Order DKNN methods so "None" (no DKNN) is the baseline level in plots.
results$DKNN_method <- factor(results$DKNN_method,
                              levels = c("None", "Conformal", "Normal"))
results
plot_bars <- function(df, dataset_name, col_name) {
  # Dodged bar chart comparing models across DKNN methods for one metric.
  #
  # df           data frame with columns Model, DKNN_method, and `col_name`
  # dataset_name dataset name, used in the plot title
  # col_name     string naming the metric column to plot (assumed to lie in
  #              [0, 1] -- values outside are dropped by the y limits)
  #
  # Returns a ggplot object.
  col_name_pretty <- str_to_title(gsub("_", " ", col_name))
  # .data[[col_name]] is the tidy-eval pronoun for a column named by a
  # string; it is safer than get(col_name), which can accidentally pick up
  # a variable from the calling environment instead of the data.
  ggplot(df, aes(fill = Model, y = .data[[col_name]], x = DKNN_method)) +
    # geom_col() is the idiomatic form of geom_bar(stat = "identity").
    geom_col(position = "dodge") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1), add = c(0, 0)),
                       limits = c(0, 1), name = col_name_pretty) +
    scale_x_discrete(name = "DKNN Method") +
    scale_fill_brewer(palette = "Set1") +
    ggtitle(paste(str_to_title(dataset_name), "-", col_name_pretty)) +
    # Print each bar's value just above it.
    geom_text(aes(label = round(.data[[col_name]], 3)),
              position = position_dodge(width = 0.9), vjust = -0.25) +
    theme_cowplot() +
    theme(
      plot.title = element_text(hjust = 0.5)
    )
}
# All double-valued columns are treated as metrics to plot.
# select(where(is.double)) replaces the superseded select_if(is_double).
numeric_columns <- results %>% select(where(is.double)) %>% colnames()
# For each dataset, compare model + DKNN method on every numeric metric.
for (dataset_name in unique(results$Dataset)) {
  res_subset <- results %>% filter(Dataset == dataset_name)
  for (col_name in numeric_columns) {
    # plot_bars() prettifies the column name itself, so no need to
    # precompute col_name_pretty here (the original computed it unused).
    print(plot_bars(res_subset, dataset_name, col_name))
  }
}
It is not yet clear how to interpret the overall confidence score: does a higher score mean the explanation is more faithful? (TODO: confirm.)
Most metrics are roughly the same across models; DeBERTa is slightly better than BART-large on both datasets.
From the graph results, it appears that DKNN for the most part does not degrade model performance, but the drop could be significant for the Founta F1 (this does not happen for ToxiGen).
# Founta (twitter-hate) F1 on the eval and predict splits, by model and
# DKNN method.
plot_bars(filter(results, Dataset == "twitter-hate"), "twitter-hate", "eval_f1")
plot_bars(filter(results, Dataset == "twitter-hate"), "twitter-hate", "predict_f1")
Interestingly, this is because for Founta et al. only, DKNN seems to drastically decrease recall (TP / True P: the percentage of all actually toxic tweets that the model classifies as toxic; the fraction of relevant instances that are retrieved) in favor of precision (TP / Pred P: the percentage of tweets the model flags as toxic that are actually toxic; the fraction of retrieved instances that are relevant).
This happens for both the eval and test sets for Founta.
# Founta (twitter-hate) precision and recall on the eval and predict
# splits, by model and DKNN method.
plot_bars(filter(results, Dataset == "twitter-hate"), "twitter-hate", "eval_precision")
plot_bars(filter(results, Dataset == "twitter-hate"), "twitter-hate", "eval_recall")
plot_bars(filter(results, Dataset == "twitter-hate"), "twitter-hate", "predict_precision")
plot_bars(filter(results, Dataset == "twitter-hate"), "twitter-hate", "predict_recall")
# NOTE(review): founta_bart_large_eval_conformal and
# founta_bart_large_eval_normal are printed here but never assigned in this
# chunk -- presumably they were loaded earlier (e.g. from neighbor_dfs, like
# the deberta variant below); verify before running this file top-to-bottom.
founta_bart_large_eval_conformal
founta_bart_large_eval_normal
# Taken from the second element of neighbor_dfs; by its name this should be
# the conformal-DKNN neighbors for deberta-large on the Founta eval split
# (confirm against wherever neighbor_dfs is built).
founta_deberta_large_eval_conformal <- neighbor_dfs[[2]]
founta_deberta_large_eval_conformal
It's then natural to inquire: what is the percentage overlap (i.e., how dissimilar are the nearest neighbors) between BART-large and DeBERTa on the Founta and ToxiGen datasets?
compute_neighbor_overlap_per_example <- function(i, df1, df2) {
  # For evaluation example `i`, compute how much the nearest-neighbor sets
  # found by two models overlap.
  #
  # i    evaluation-example index, matched against the eval_idx column
  # df1  data frame of neighbors with columns eval_idx and train_idx
  # df2  second neighbor data frame in the same format
  #
  # Returns c(set1_perc, set2_perc): the fraction of each model's unique
  # neighbors for example `i` that also appear in the other model's set.
  # A fraction is NA_real_ when that model has no neighbors for `i`
  # (the original code silently produced NaN from 0/0 in that case).
  set1 <- unique(df1$train_idx[df1$eval_idx == i])
  set2 <- unique(df2$train_idx[df2$eval_idx == i])
  shared <- intersect(set1, set2)
  overlap_frac <- function(s) {
    if (length(s) > 0) length(shared) / length(s) else NA_real_
  }
  c(overlap_frac(set1), overlap_frac(set2))
}
# Computing the overlap for every eval example takes very long, so use a
# subsample of the first N examples instead.
# N <- nrow(founta_eval)
N <- 100
# 2 x N matrix: row 1 = bart-large overlap fractions, row 2 = deberta-large.
# vapply (with a declared numeric(2) output per call) replaces sapply for a
# guaranteed return shape, and seq_len(N) replaces the accidental seq(1:N).
res_matrix <- vapply(
  seq_len(N),
  function(i) {
    compute_neighbor_overlap_per_example(
      i, founta_bart_large_eval_conformal,
      founta_deberta_large_eval_conformal
    )
  },
  numeric(2)
)
# Per-example neighbor-overlap summaries. Summarize the full rows instead of
# a hard-coded 1:100 so this stays in sync if N changes above.
print(summary(res_matrix[1, ])) # bart-large stats
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05319 0.12431 0.16775 0.18588 0.21451 0.62500
print(summary(res_matrix[2, ])) # deberta-large stats
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0625 0.1502 0.2034 0.2163 0.2590 0.7407